{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from os import listdir\n",
    "from os.path import isfile, join\n",
    "import numpy as np\n",
    "import matplotlib.image as mpimg\n",
    "from skimage.transform import resize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gc\n",
    "\n",
    "# Pneumonia images. NOTE: despite the `_neg` suffix, this class is labelled 1.\n",
    "X_train_neg = []\n",
    "mypath = \"data/train/PNEUMONIA/\"\n",
    "# sorted() fixes the sample order: listdir() returns names in arbitrary order,\n",
    "# so without it the seeded train/test split is not guaranteed to be reproducible.\n",
    "for im_name in sorted(listdir(mypath)):\n",
    "    im_path = join(mypath, im_name)\n",
    "    if not (isfile(im_path) and im_name.endswith(\".jpeg\")):\n",
    "        continue\n",
    "    img = resize(np.asarray(mpimg.imread(im_path)), (256, 256))\n",
    "    # Per the original author's note, some images carry three channels that duplicate\n",
    "    # the grayscale channel, so only channel 0 is kept. Index the last axis for that:\n",
    "    # reshape(-1, 256, 256)[0] would reinterpret the interleaved (256, 256, C) buffer\n",
    "    # instead of selecting a channel.\n",
    "    if img.ndim > 2:\n",
    "        img = img[:, :, 0]\n",
    "    X_train_neg.append(img)\n",
    "\n",
    "if not X_train_neg:\n",
    "    raise ValueError(\"no .jpeg images found in \" + mypath)\n",
    "\n",
    "# One label per pneumonia image; the label value is 1.\n",
    "y_train_neg = [1] * len(X_train_neg)\n",
    "\n",
    "# NOTE(review): X_train_pos / y_train_pos are not defined in any visible cell of this\n",
    "# notebook; they must already exist in the kernel (presumably the other class, loaded\n",
    "# the same way) - confirm, otherwise this raises NameError on Restart & Run All.\n",
    "#\n",
    "# Every image is a (256, 256) array, so each list stacks into one (n, 256, 256) array.\n",
    "# The per-class stacks are passed straight to concatenate() so that no extra names keep\n",
    "# those full-size copies alive afterwards.\n",
    "X_train = np.concatenate((np.stack(X_train_pos, axis=0), np.stack(X_train_neg, axis=0)))\n",
    "y_train = np.concatenate((np.stack(y_train_pos, axis=0), np.stack(y_train_neg, axis=0)))\n",
    "\n",
    "# Drop the references to the per-class lists, which are no longer needed\n",
    "X_train_pos = None\n",
    "X_train_neg = None\n",
    "y_train_pos = None\n",
    "y_train_neg = None\n",
    "\n",
    "# Force a garbage collection pass to release the memory that is no longer referenced\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 20% of the samples for evaluation; the fixed seed makes the shuffle repeatable.\n",
    "# NOTE: X_train / y_train are rebound to the training portion, so running this cell a\n",
    "# second time splits the already-reduced training set again.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X_train, y_train, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "# Linear models take one flat feature vector per sample: (n, 256, 256) -> (n, 65536)\n",
    "X_train, X_test = (part.reshape(-1, 256*256) for part in (X_train, X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9463414634146341\n",
      "0.9325790910203876\n",
      "0.9622641509433962\n"
     ]
    }
   ],
   "source": [
    "# Fit a logistic-regression baseline and score it on the held-out split.\n",
    "# X_train / X_test were already flattened to (n, 256*256) by the split cell above.\n",
    "clf = LogisticRegression(solver='liblinear')\n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "# Hard 0/1 predictions feed the threshold-based metrics (accuracy, F1).\n",
    "y_pred = clf.predict(X_test)\n",
    "# ROC AUC is a ranking metric, so it is scored with the predicted probability of the\n",
    "# class with the greater label (column 1 of predict_proba) rather than with hard labels,\n",
    "# which would collapse the ROC curve to a single operating point.\n",
    "y_score = clf.predict_proba(X_test)[:, 1]\n",
    "\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "auc = roc_auc_score(y_test, y_score)\n",
    "f1 = f1_score(y_test, y_pred)\n",
    "\n",
    "# Printed in the order: accuracy, ROC AUC, F1\n",
    "print(accuracy)\n",
    "print(auc)\n",
    "print(f1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The submission images are read as data/test/1.jpeg ... data/test/1757.jpeg, in id order.\n",
    "N_TEST_IMAGES = 1757\n",
    "\n",
    "test = []\n",
    "mypath = \"data/test/\"\n",
    "for i in range(1, N_TEST_IMAGES + 1):\n",
    "    img = resize(np.asarray(mpimg.imread(join(mypath, '%d.jpeg' % i))), (256, 256))\n",
    "    # Multi-channel images: keep channel 0 by indexing the last axis.\n",
    "    # reshape(-1, 256, 256)[0] would reinterpret the interleaved (256, 256, C) buffer\n",
    "    # instead of selecting a channel.\n",
    "    if img.ndim > 2:\n",
    "        img = img[:, :, 0]\n",
    "    test.append(img)\n",
    "\n",
    "# Stack to (n, 256, 256), then flatten each image to the 256*256 feature vector used for clf\n",
    "x_test = np.stack(test, axis=0).reshape(-1, 256*256)\n",
    "# NOTE: this rebinds y_pred to the predictions for the submission images\n",
    "y_pred = clf.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  pred\n",
       "0   1     0\n",
       "1   2     0\n",
       "2   3     1\n",
       "3   4     1\n",
       "4   5     1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One row per prediction; ids are 1-based and follow the order of y_pred\n",
    "submission = pd.DataFrame({'id': range(1, len(y_pred) + 1), 'pred': y_pred})\n",
    "# Written without an index column and without a header row\n",
    "submission.to_csv(\"data/ml_submission.csv\", index=False, header=False)\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
